Lets start with LASSO
bwt_df =
read_csv("./data/extra_topic_data/birthweight.csv") %>%
janitor::clean_names() %>%
mutate(
babysex = as.factor(babysex),
babysex = fct_recode(babysex, "male" = "1", "female" = "2"),
frace = as.factor(frace),
frace = fct_recode(frace, "white" = "1", "black" = "2", "asian" = "3",
"puerto rican" = "4", "other" = "8"),
malform = as.logical(malform),
mrace = as.factor(mrace),
mrace = fct_recode(mrace, "white" = "1", "black" = "2", "asian" = "3",
"puerto rican" = "4")) %>%
sample_n(200)
## Parsed with column specification:
## cols(
## .default = col_double()
## )
## See spec(...) for full column specifications.
To use the lasso, we will use glmnet
y = bwt_df$bwt
x = model.matrix(bwt ~ ., bwt_df)[,-1]
lasso_fit = glmnet(x, y)
Make sure you set those lambdas to make sure its the same
lambda = 10^(seq(3, -2, -0.1))
lasso_fit =
glmnet(x, y, lambda = lambda)
lasso_cv =
cv.glmnet(x, y, lambda = lambda)
lambda_opt = lasso_cv$lambda.min
Still going to use the broom packeage on the lasso fit because it will look better
broom::tidy(lasso_fit) %>%
select(term, lambda, estimate) %>%
complete(term, lambda, fill = list(estimate = 0) )
## # A tibble: 1,020 x 3
## term lambda estimate
## <chr> <dbl> <dbl>
## 1 (Intercept) 0.01 -4524.
## 2 (Intercept) 0.0126 -4510.
## 3 (Intercept) 0.0158 -4493.
## 4 (Intercept) 0.0200 -4472.
## 5 (Intercept) 0.0251 -4444.
## 6 (Intercept) 0.0316 -4411.
## 7 (Intercept) 0.0398 -4367.
## 8 (Intercept) 0.0501 -4312.
## 9 (Intercept) 0.0631 -4242.
## 10 (Intercept) 0.0794 -4152.
## # … with 1,010 more rows
broom::tidy(lasso_fit) %>%
select(term, lambda, estimate) %>%
complete(term, lambda, fill = list(estimate = 0) ) %>%
filter(term != "(Intercept)") %>%
ggplot(aes(x = log(lambda, 10), y = estimate, group = term, color = term)) +
geom_path() +
geom_vline(xintercept = log(lambda_opt, 10), color = "blue", size = 1.2) +
theme(legend.position = "none")
This is the cross validation part
broom::tidy(lasso_cv) %>%
ggplot(aes(x = log(lambda, 10), y = estimate)) +
geom_point()
The blue line in the first model is the smallest lamnda will be for the set of predictions thats are present from the second graph.
poke_df =
read_csv("./data/extra_topic_data/pokemon.csv") %>%
janitor::clean_names() %>%
select(hp, speed)
## Parsed with column specification:
## cols(
## `#` = col_double(),
## Name = col_character(),
## `Type 1` = col_character(),
## `Type 2` = col_character(),
## Total = col_double(),
## HP = col_double(),
## Attack = col_double(),
## Defense = col_double(),
## `Sp. Atk` = col_double(),
## `Sp. Def` = col_double(),
## Speed = col_double(),
## Generation = col_double(),
## Legendary = col_logical()
## )
poke_df %>%
ggplot(aes(x = hp, y = speed)) +
geom_point()
We clustersing now
kmeans_fit =
kmeans(x = poke_df, centers = 3)
We processing and plotting
poke_df =
broom::augment(kmeans_fit, poke_df)
poke_df %>%
ggplot(aes(x = hp, y = speed, color = .cluster)) +
geom_point()
traj_data =
read_csv("./data/extra_topic_data/trajectories.csv")
## Parsed with column specification:
## cols(
## subj = col_double(),
## week = col_double(),
## value = col_double()
## )
traj_data %>%
ggplot(aes(x = week, y = value, group = subj)) +
geom_point() +
geom_path()
Now we need to get intercepts and slopes for eveyone
int_slope_df =
traj_data %>%
nest(data = week:value) %>%
mutate(
models = map(data, ~lm(value ~ week, data = .x)),
result = map(models, broom::tidy)
) %>%
select(-data, -models) %>%
unnest(result) %>%
select(subj, term, estimate) %>%
pivot_wider(
names_from = term,
values_from = estimate
) %>%
rename(int = "(Intercept)", slope = week)
Try to kmeans this but first we gonna plot the intercept and slope
int_slope_df %>%
ggplot(aes(x = int, y = slope)) +
geom_point()
Have to do some processing, because it will treat the sunject as something to be clustered but thats not the case
km_fit =
kmeans(
x = int_slope_df %>%
select(-subj) %>%
scale, centers = 2)
int_slope_df =
broom::augment(km_fit, int_slope_df)
We clustered this
int_slope_df %>%
ggplot(aes(x = int, y = slope, color = .cluster)) +
geom_point()
We joined the two together
left_join(traj_data, int_slope_df) %>%
ggplot(aes(x = week, y = value, group = subj, color = .cluster)) +
geom_point() +
geom_path()
## Joining, by = "subj"